# !/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : 批量写入数据(单线程原始bulk写入).py
# @Author: dongguangwen
# @Date  : 2025-05-31 18:12
from elasticsearch import Elasticsearch
import json
import time

# Module-level Elasticsearch client bound to a single hard-coded node address.
es = Elasticsearch(hosts='http://192.168.1.119:9200')

# NOTE(review): `actions` is not referenced in the visible part of this script.
actions = []

# Start timestamp for the elapsed-time report printed at the end of the script;
# it is taken after the client object has been constructed.
start = time.time()


# Build the payload format required by the Bulk API
def make_bulk_data(line):
    """Return the two-line Bulk API (NDJSON) fragment for one person record.

    The first line is an ``index`` action targeting the ``persons`` index with
    the record's ``id`` (converted to a string) as ``_id``; the second line is
    the document itself, regrouped into ``simple_profile``, ``profile``,
    ``location`` and ``net_info`` sub-objects. Both lines end with ``\\n``.

    Args:
        line: One parsed source record (a dict). Keys that are absent are
            emitted as ``null``. A missing or null ``simple_profile`` /
            ``profile`` object is treated as empty instead of raising.

    Returns:
        str: ``"<action json>\\n<document json>\\n"``.
    """

    def pick(source, *keys):
        # Copy the listed keys in order; absent keys become None (JSON null).
        return {key: source.get(key) for key in keys}

    # Tolerate records where the nested objects are missing or explicitly null.
    simple_profile = line.get('simple_profile') or {}
    profile = line.get('profile') or {}

    # Serialize the action with json.dumps so ids containing quotes or
    # backslashes are escaped and the line stays valid JSON. str() keeps the
    # "_id" a string, matching the previous "%s" formatting.
    action = {"index": {"_index": "persons", "_id": str(line.get('id'))}}

    doc = pick(line, 'id', 'name', 'sex', 'age', 'email')
    doc["simple_profile"] = pick(
        simple_profile,
        'username', 'name', 'sex', 'address', 'mail', 'birthdate',
    )
    doc["profile"] = pick(
        profile,
        'job', 'company', 'ssn', 'residence', 'blood_group', 'website',
        'username', 'name', 'sex', 'address', 'mail', 'birthdate',
    )
    # Location and network fields are flat in the source record and are
    # grouped into sub-objects here.
    doc["location"] = pick(
        line,
        'country', 'province', 'city', 'street_name', 'street_address',
        'address', 'postcode', 'gps',
    )
    doc["net_info"] = pick(
        line,
        'ipv4', 'ipv6', 'uri', 'url', 'img_url', 'domain', 'user_agent',
    )
    doc.update(pick(line, 'character', 'long_text', 'time'))

    return json.dumps(action) + '\n' + json.dumps(doc) + '\n'


def _send_bulk(parts):
    """Send one bulk request built from *parts* and report item-level failures.

    The Bulk API can answer successfully while individual items fail; those
    failures only show up in the response's ``errors`` flag and ``items``
    list, so they are checked here and reported instead of being dropped.
    """
    resp = es.bulk(body=''.join(parts))
    if resp['errors']:
        failed = sum(1 for item in resp['items'] if 'error' in item.get('index', {}))
        print(f"警告: 本次bulk请求中有 {failed} 条文档写入失败")


# Stream the newline-delimited JSON file and index it in batches.
with open('./person_info_100w.json', 'r', encoding='utf-8') as f:
    bulk_parts = []
    bulk_bytes = 0
    for raw_line in f:
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # json.loads fail on them.
        if not raw_line.strip():
            continue
        record = json.loads(raw_line)
        bulk_line = make_bulk_data(record)
        bulk_parts.append(bulk_line)
        # Keep a running byte count so only the new fragment is encoded,
        # rather than re-encoding the whole accumulated buffer on every line.
        bulk_bytes += len(bulk_line.encode('utf-8'))
        if bulk_bytes >= 10 * 1024 * 1024:  # keep each bulk request at roughly 10 MB
            _send_bulk(bulk_parts)
            print(f"已处理ID: {record.get('id')}")
            bulk_parts = []
            bulk_bytes = 0
    # Flush whatever is left after the last full batch.
    if bulk_parts:
        _send_bulk(bulk_parts)

end = time.time()
print('花费时间：', end - start)

"""
花费时间： 3039.9836118221283
"""